section .bss
    ; Trail intensity map: 128 rows of 256 one-byte cells, stored row by row,
    ; so the cell at (row, col) lives at trailmap + row*256 + col.
    trailmap:   resb 128*256

    ; Particle table. Must stay directly after trailmap.
    ; Six bytes are reserved (and filled at start-up) for each of 256 particles.
    ; NOTE(review): the timer handler reads posx, posy, velx, vely as single
    ; bytes at offsets 0..3 and advances 4 bytes per particle, so only the
    ; first 256*4 bytes of this table are actually used - confirm which of the
    ; two layouts is intended.
    particles:  resb 6*256


section .text
    org 0x100             ; Origin for .COM files

;;; Program entry (first instruction of the .COM image).
;;;
;;; Prepares everything the timer handler reads, switches to mode 13h, hooks
;;; INT 0x1C and then idles forever; all further work happens inside I_main.
;;; The hook is installed LAST: I_main may run as soon as the vector is set,
;;; so it must never see a half-filled particle table.

;;; clear_trailmap

    ; .bss is only reserved, not stored in the .COM file, so this program
    ; cannot rely on its initial contents.
    cld                       ; rep stosb must walk upwards
    xor ax, ax                ; fill value 0; also a fixed seed for the fill below
    mov di, trailmap
    mov cx, 256*128
    rep stosb                 ; ES = our segment at .COM entry

;;; end clear_trailmap

;;;; init stuff

    ; Fill the particle table with pseudo-random bytes from a 16-bit linear
    ; congruential generator. The HIGH byte is stored: the low byte of a
    ; generator taken mod 2^16 repeats after at most 256 steps, which would
    ; turn the table into one 256-byte pattern repeated over and over.
    mov di, particles
    mov cx, 6*256
    L_test_fill:
        imul ax, 1337
        add ax, 7331
        mov [di], ah
        inc di
        loop L_test_fill

    ; Set video mode to 13h (320x200); AH is loaded explicitly.
    mov ax, 0x0013
    int 0x10

    ; Set our custom ISR for int 0x1c (DOS function 0x25, DS:DX = handler)
    mov ax, 0x2500 + 0x1c
    mov dx, I_main
    int 0x21

;;;; end init stuff

halt:
    hlt                       ; sleep until the next interrupt
    jmp halt



;;; I_main -- INT 0x1C (timer tick) handler: one simulation step plus redraw.
;;;
;;; Per tick: vertical smear, horizontal blur, particle sense/move/deposit,
;;; then the trail map is copied to the centre of the mode 13h screen.
;;;
;;; In:    nothing is assumed about DS, ES or the direction flag
;;; Out:   all general and segment registers are restored before iret
;;; Needs: 186+ (pusha/popa, shifts by an immediate count)

TRAIL_COLS      equ 256
TRAIL_ROWS      equ 128
TRAIL_MASK      equ TRAIL_COLS*TRAIL_ROWS - 1   ; keeps a (row<<8 | col) offset inside the map
VBLUR_STRIDE    equ 6
VBLUR_COUNT     equ ((TRAIL_ROWS-1)*TRAIL_COLS) / VBLUR_STRIDE ; 5418, never reaches past the last row
PARTICLE_COUNT  equ 256
PARTICLE_STRIDE equ 4                           ; posx, posy, velx, vely - one byte each
DEPOSIT_LEVEL   equ 0xFF-8                      ; the -8 leaves room for the rounding while blitting
SCREEN_COLS     equ 320
SCREEN_ROWS     equ 200

I_main: ; called at 18.2 Hz by timer interrupt

        pusha
        push ds
        push es

        ; A .COM program keeps code and data in one segment, so CS is the
        ; reliable source for DS/ES; the interrupted code's DS may differ.
        push cs
        pop ds
        push cs
        pop es
        cld                         ; lodsb/stosb below must walk upwards

    ;;;; vertical "blur"
        ; actually just swapping vertical pairs of pixels here and there.
        ; the horizontal step will smooth it out, and it does the job of smearing vertically

        mov si, trailmap
        mov cx, VBLUR_COUNT
        L_vert_blur:

            mov al, [si]                    ; al is top pixel
            xchg al, [si+TRAIL_COLS]        ; top goes below, al becomes the bottom pixel
            mov [si], al                    ; bottom pixel goes on top

            add si, VBLUR_STRIDE            ; SI is only advanced here, so the stride really is 6

            loop L_vert_blur

    ;;;; end vertical "blur"


    ;;;; horizontal blur
        ; In place: each cell becomes floor((prev + 2*curr + next) / 4), with
        ; both row ends wrapping around to the other end of the same row.

        mov si, trailmap            ; read cursor
        mov di, si                  ; write cursor, always one cell behind SI

        mov cx, TRAIL_ROWS
        L_horiz_blur_row:
            push cx

            mov dl, [si]                ; first pixel of the row, kept for the last column
            mov dh, [si+TRAIL_COLS-1]   ; last pixel of the row = "prev" of the first column

            mov cx, TRAIL_COLS
            L_horiz_blur_pixel:

                xor ah, ah
                xor bh, bh

                mov al, dh              ; prev (original value, not the blurred one)

                mov bl, [si]            ; curr
                add ax, bx
                add ax, bx

                mov dh, bl              ; curr becomes prev for the next column

                inc si

                mov bl, dl              ; next defaults to the row's first pixel (wrap)
                cmp cx, 1               ; counter is 1 for the last column
                je skipreadnext
                    mov bl, [si]        ; not in the last column: reading there would
                                        ; leave the row (and, in the last row, the map)
                skipreadnext:

                add ax, bx

                shr ax, 2
                stosb
                loop L_horiz_blur_pixel

            pop cx
            loop L_horiz_blur_row

    ;;;; end horizontal blur

    ;;;; handle particles

        mov cx, PARTICLE_COUNT
        mov si, particles
        L_foreach_particle:
            push cx

            ;;;; sense

                mov ax, [si+2] ; loads velx into al and vely into ah

                sar al, 5 ; velocity is a 2.6 fixed-point number, we need a multiple of it here
                sar ah, 5

                ; (bl, bh) = (-ah, al): the look-ahead vector turned a quarter turn
                mov bh, al
                mov bl, ah
                neg bl

                mov cx, [si] ; sensor position, starting with particle pos (cl = x, ch = y)
                add cl, al
                add ch, ah

                add cl, bl
                add ch, bh

                ; Sample the trail map itself: wrap the row to 0..127 and add
                ; the map base, exactly as the deposit step does.
                mov di, cx
                and di, TRAIL_MASK
                mov dl, [trailmap+di]

                ; look the other way
                sub cl, bl
                sub ch, bh
                sub cl, bl
                sub ch, bh

                mov di, cx
                and di, TRAIL_MASK
                mov dh, [trailmap+di]

                ; dl is the trail sample left, dh is the trail sample right

                mov cx, [si+2] ; loads velx into cl vely into ch

                cmp dl, dh              ; intensities are unsigned 0..255
                ja B_turn_left
                jb B_turn_right
                jmp B_turn_end

                B_turn_left:
                    add cl, bl
                    add ch, bh
                    jmp B_turn_end

                B_turn_right:
                    sub cl, bl
                    sub ch, bh

                B_turn_end:

                mov [si+2], cx ; stores cl into velx and ch into vely

            ;;; end sense

            ;;; move

                mov ax, cx ; the updated velocity: velx in al, vely in ah

                sar al, 6 ; velocity is a 2.6 fixed-point number
                sar ah, 6

                add al, [si]
                add ah, [si+1]

                mov [si], ax ; stores al into posx and ah into posy

            ;;; end move

            ;;; deposit

                ; ax is already the position; there are only 128 rows
                mov di, ax
                and di, TRAIL_MASK
                mov byte [trailmap+di], DEPOSIT_LEVEL

            ;;; end deposit

        pop cx
        add si, PARTICLE_STRIDE
        loop L_foreach_particle

    ;;;; end handle particles


    ;;;; copy trail map to screen ;;;;

        ; Set ES to video memory segment
        mov ax, 0xA000
        mov es, ax

        ; Centre the 256x128 map on the 320x200 screen:
        ; (320-256)/2 = 32 pixels horizontally, (200-128)/2 = 36 pixels vertically
        mov di, ((SCREEN_ROWS-TRAIL_ROWS)/2)*SCREEN_COLS + (SCREEN_COLS-TRAIL_COLS)/2

        mov si, trailmap

        mov cx, TRAIL_ROWS
        L_copy_row:
            push cx

            mov cx, TRAIL_COLS
            L_copy_pixel:
                lodsb

                add al, 8   ; rounding
                sbb ah, ah  ; ah = 0xFF if that carried, else 0
                or al, ah   ; saturate at 255 instead of wrapping round to dark

                shr al, 4 ; 0..255 to 0..15
                add al, 16 ; change to 32 for rainbow

                stosb
                loop L_copy_pixel

            ; skip the screen columns to the right and left of the map
            add di, SCREEN_COLS-TRAIL_COLS

            pop cx
            loop L_copy_row

    ;;;; end copy trail map ;;;;

        pop es
        pop ds
        popa

iret ; I_main is an interrupt handler, so we must return with iret

